Raw data

- GEO number: GSE128615

- Study summary: To determine the influence of differential Kdm6a expression in immune cells, whole transcriptome analysis for CD4+ T cells from WT and Kdm6a cKO mice were performed using RNA-Seq.

Loading packages

- DESeq2: http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html

library(data.table)
library(tidyverse)
library(rmarkdown)
library(ggplot2)
library(pheatmap)
library(AnnotationHub)
library(tximport)
library(Rsubread)
library(DESeq2)
library(UpSetR)
library(ensembldb)

Setting AnnotationHub

Assign your species of interest

DB <- "EnsDb"
AnnotationSpecies <- "mus musculus"  # Assign your species 
ah <- AnnotationHub(hub=getAnnotationHubOption("URL"))  # Bring annotation DB

Running AnnotationHub

ahQuery <- query(ah, 
                 pattern=c(DB, AnnotationSpecies), 
                 ignore.case=T)      # Filter annotation of interest


# Select the most recent data
DBName <- mcols(ahQuery) %>%
    rownames() %>%
    tail(1)

# Save the dataset of interest
AnnoDb <- ah[[DBName]] 

# Explore your EnsDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="TXID")
# Note: Annotation has to be done with not genome but transcripts 
AnnoDb <- select(AnnoDb, 
                 AnnoKey,
                 keytype="TXID",
                 columns=c("GENEID", "GENENAME")) 


# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb)    # The column 1 has to assign transcript (e.g. ENSEMBLTRANS)
##                 TXID             GENEID GENENAME
## 1 ENSMUST00000000001 ENSMUSG00000000001    Gnai3
## 2 ENSMUST00000000003 ENSMUSG00000000003     Pbsn
## 3 ENSMUST00000000010 ENSMUSG00000020875    Hoxb9
## 4 ENSMUST00000000028 ENSMUSG00000000028    Cdc45
## 5 ENSMUST00000000033 ENSMUSG00000048583     Igf2
## 6 ENSMUST00000000049 ENSMUSG00000000049     Apoh

Metadata setting

# This code chunk needs to be written by yourself 

# Define sample names 
SampleNames <- c(paste0("WT-rep", 1:3), paste0("cKO-rep", 1:3))  

# Aligner names
Aligners <- c("Salmon", "STAR", "HISAT2")

# Define group level
GroupLevel <- c("WT", "cKO")

# Define contrast for DE analysis
Contrast <- c("Group", GroupLevel)

# Set a function for file paths
path.fn <- function(head, tail) { 

    vec <- c(paste0("../",
                    head,    # head = e.g. "hisat2", "star", or "salmon"
                    "_output/",
                    SampleNames,
                    tail))   # tail = file name after SampleNames 

    return(vec)
}


# Define .sf file path
sf <- path.fn("salmon",
              ".salmon_quant/quant.sf")


# Define STAR file path
star <- path.fn("star", 
                "Aligned.sortedByCoord.out.bam")

# Define HISAT2 file path
hisat <- path.fn("hisat2",
                 ".sorted.bam")


# Define sample groups
group <- c(rep(GroupLevel[1], 3), rep(GroupLevel[2], 3))

# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
                       Group=factor(group, levels=GroupLevel),
                       Salmon_path=sf,
                       STAR_path=star, 
                       HISAT2_path=hisat)

# Assign row names with sample names
rownames(metadata) <- SampleNames


# Explore the metadata
print(metadata)
##            Sample Group                                     Salmon_path
## WT-rep1   WT-rep1    WT  ../salmon_output/WT-rep1.salmon_quant/quant.sf
## WT-rep2   WT-rep2    WT  ../salmon_output/WT-rep2.salmon_quant/quant.sf
## WT-rep3   WT-rep3    WT  ../salmon_output/WT-rep3.salmon_quant/quant.sf
## cKO-rep1 cKO-rep1   cKO ../salmon_output/cKO-rep1.salmon_quant/quant.sf
## cKO-rep2 cKO-rep2   cKO ../salmon_output/cKO-rep2.salmon_quant/quant.sf
## cKO-rep3 cKO-rep3   cKO ../salmon_output/cKO-rep3.salmon_quant/quant.sf
##                                                     STAR_path
## WT-rep1   ../star_output/WT-rep1Aligned.sortedByCoord.out.bam
## WT-rep2   ../star_output/WT-rep2Aligned.sortedByCoord.out.bam
## WT-rep3   ../star_output/WT-rep3Aligned.sortedByCoord.out.bam
## cKO-rep1 ../star_output/cKO-rep1Aligned.sortedByCoord.out.bam
## cKO-rep2 ../star_output/cKO-rep2Aligned.sortedByCoord.out.bam
## cKO-rep3 ../star_output/cKO-rep3Aligned.sortedByCoord.out.bam
##                                   HISAT2_path
## WT-rep1   ../hisat2_output/WT-rep1.sorted.bam
## WT-rep2   ../hisat2_output/WT-rep2.sorted.bam
## WT-rep3   ../hisat2_output/WT-rep3.sorted.bam
## cKO-rep1 ../hisat2_output/cKO-rep1.sorted.bam
## cKO-rep2 ../hisat2_output/cKO-rep2.sorted.bam
## cKO-rep3 ../hisat2_output/cKO-rep3.sorted.bam

featureCounts parameter setting

# "mm10", "mm9", "hg38", or "hg19"
# cf. mm10 = GRCm38
annot.inbuilt <- "mm10" 

# GTF file path
annot.ext <- "../../mouse_reference/gencode.m25.primary_assembly.annotation.gtf"

# annotation type:
# e.g.: "gene_id", "transcript_id", or "gene_name"
GTF.attrType <- "gene_id"

# Number of cores 
nthread <- 16 

# Set a function importing counts from BAM files with featureCounts()
fcounts.fn <- function(vec) {

    fc <- featureCounts(files=vec,   # a vector assigning BAM file paths
                         annot.inbuilt=annot.inbuilt,
                         annot.ext=annot.ext,
                         GTF.attrType=GTF.attrType,
                         isGTFAnnotationFile=T,
                         nthread=nthread, 
                         isPairedEnd=F, # Set this parameter correctly 
                         verbose=T)

    return(fc$counts)
}

Importing counts

Importing Salmon counts

Note:

txi1 <- tximport(…, txOut=F)

txi2 <- tximport(…, txOut=T)

txi2 <- summarizedToGene(…)

counts extracted from txi1 and txi2 are the same

# Import gene level summarized counts 
salmon.txi <- tximport(metadata$Salmon_path,
                       type = "salmon",
                       tx2gene=AnnoDb,
                       ignoreTxVersion=T, 
                       txOut=F)       # TRUE for transcript level, FALSE for gene level 

# Extract the counts and save as a data frame
salmon.counts <- salmon.txi$counts

# Explore the salmon count data frame
head(salmon.counts)
##                        [,1]     [,2]     [,3]    [,4]    [,5]     [,6]
## ENSMUSG00000000001 6492.000 5822.000 5614.000 6040.00 6142.00 5437.000
## ENSMUSG00000000003    0.000    0.000    0.000    0.00    0.00    0.000
## ENSMUSG00000000028 3904.066 3567.989 3714.192 3410.46 3463.04 3142.454
## ENSMUSG00000000031   33.000   45.000   58.000   12.00   28.00   70.000
## ENSMUSG00000000037  119.001   90.001   82.000  165.00  129.00  100.000
## ENSMUSG00000000049    0.000    0.000    0.000    1.00    2.00    0.000
dim(salmon.counts)
## [1] 51956     6
summary(salmon.counts)
##        V1               V2                 V3                 V4          
##  Min.   :     0   Min.   :     0.0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:     0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0  
##  Median :     0   Median :     0.0   Median :     0.0   Median :     0.0  
##  Mean   :   721   Mean   :   642.5   Mean   :   627.2   Mean   :   658.2  
##  3rd Qu.:    22   3rd Qu.:    20.0   3rd Qu.:    20.0   3rd Qu.:    21.2  
##  Max.   :357534   Max.   :322612.0   Max.   :318293.5   Max.   :343971.6  
##        V5                 V6          
##  Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:     0.0   1st Qu.:     0.0  
##  Median :     0.0   Median :     0.0  
##  Mean   :   662.9   Mean   :   568.5  
##  3rd Qu.:    20.1   3rd Qu.:    18.0  
##  Max.   :337557.2   Max.   :305036.9

Importing STAR counts

# Extract counts by running featureCounts() 
star.counts <- fcounts.fn(metadata$STAR_path)
## 
##         ==========     _____ _    _ ____  _____  ______          _____  
##         =====         / ____| |  | |  _ \|  __ \|  ____|   /\   |  __ \ 
##           =====      | (___ | |  | | |_) | |__) | |__     /  \  | |  | |
##             ====      \___ \| |  | |  _ <|  _  /|  __|   / /\ \ | |  | |
##               ====    ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
##         ==========   |_____/ \____/|____/|_|  \_\______/_/    \_\_____/
##        Rsubread 2.4.0
## 
## //========================== featureCounts setting ===========================\\
## ||                                                                            ||
## ||             Input files : 6 BAM files                                      ||
## ||                                                                            ||
## ||                           WT-rep1Aligned.sortedByCoord.out.bam             ||
## ||                           WT-rep2Aligned.sortedByCoord.out.bam             ||
## ||                           WT-rep3Aligned.sortedByCoord.out.bam             ||
## ||                           cKO-rep1Aligned.sortedByCoord.out.bam            ||
## ||                           cKO-rep2Aligned.sortedByCoord.out.bam            ||
## ||                           cKO-rep3Aligned.sortedByCoord.out.bam            ||
## ||                                                                            ||
## ||              Paired-end : no                                               ||
## ||        Count read pairs : no                                               ||
## ||              Annotation : gencode.m25.primary_assembly.annotation.gtf  ... ||
## ||      Dir for temp files : .                                                ||
## ||                 Threads : 16                                               ||
## ||                   Level : meta-feature level                               ||
## ||      Multimapping reads : counted                                          ||
## || Multi-overlapping reads : not counted                                      ||
## ||   Min overlapping bases : 1                                                ||
## ||                                                                            ||
## \\============================================================================//
## 
## //================================= Running ==================================\\
## ||                                                                            ||
## || Load annotation file gencode.m25.primary_assembly.annotation.gtf ...       ||
## ||    Features : 843712                                                       ||
## ||    Meta-features : 55487                                                   ||
## ||    Chromosomes/contigs : 45                                                ||
## ||                                                                            ||
## || Process BAM file WT-rep1Aligned.sortedByCoord.out.bam...                   ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 70746461                                             ||
## ||    Successfully assigned alignments : 55979059 (79.1%)                     ||
## ||    Running time : 0.07 minutes                                             ||
## ||                                                                            ||
## || Process BAM file WT-rep2Aligned.sortedByCoord.out.bam...                   ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 63394034                                             ||
## ||    Successfully assigned alignments : 50159175 (79.1%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file WT-rep3Aligned.sortedByCoord.out.bam...                   ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 61430033                                             ||
## ||    Successfully assigned alignments : 48522991 (79.0%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep1Aligned.sortedByCoord.out.bam...                  ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 64533984                                             ||
## ||    Successfully assigned alignments : 51163980 (79.3%)                     ||
## ||    Running time : 0.07 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep2Aligned.sortedByCoord.out.bam...                  ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 65134349                                             ||
## ||    Successfully assigned alignments : 51790758 (79.5%)                     ||
## ||    Running time : 0.07 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep3Aligned.sortedByCoord.out.bam...                  ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 56200709                                             ||
## ||    Successfully assigned alignments : 44128747 (78.5%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Write the final count table.                                               ||
## || Write the read assignment summary.                                         ||
## ||                                                                            ||
## \\============================================================================//
# Explore the STAR count data frame
head(star.counts)
##                      WT-rep1Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                    0
## ENSMUSG00000064842.1                                    0
## ENSMUSG00000051951.5                                    0
## ENSMUSG00000102851.1                                    0
## ENSMUSG00000103377.1                                    1
## ENSMUSG00000104017.1                                    0
##                      WT-rep2Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                    0
## ENSMUSG00000064842.1                                    0
## ENSMUSG00000051951.5                                    0
## ENSMUSG00000102851.1                                    0
## ENSMUSG00000103377.1                                    2
## ENSMUSG00000104017.1                                    0
##                      WT-rep3Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                    0
## ENSMUSG00000064842.1                                    0
## ENSMUSG00000051951.5                                    0
## ENSMUSG00000102851.1                                    0
## ENSMUSG00000103377.1                                    0
## ENSMUSG00000104017.1                                    2
##                      cKO-rep1Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                     0
## ENSMUSG00000064842.1                                     0
## ENSMUSG00000051951.5                                     0
## ENSMUSG00000102851.1                                     0
## ENSMUSG00000103377.1                                     0
## ENSMUSG00000104017.1                                     1
##                      cKO-rep2Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                     0
## ENSMUSG00000064842.1                                     0
## ENSMUSG00000051951.5                                     0
## ENSMUSG00000102851.1                                     0
## ENSMUSG00000103377.1                                     0
## ENSMUSG00000104017.1                                     0
##                      cKO-rep3Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                     1
## ENSMUSG00000064842.1                                     0
## ENSMUSG00000051951.5                                     0
## ENSMUSG00000102851.1                                     1
## ENSMUSG00000103377.1                                     0
## ENSMUSG00000104017.1                                     0
dim(star.counts)
## [1] 55487     6
summary(star.counts)
##  WT-rep1Aligned.sortedByCoord.out.bam WT-rep2Aligned.sortedByCoord.out.bam
##  Min.   :     0                       Min.   :     0                      
##  1st Qu.:     0                       1st Qu.:     0                      
##  Median :     1                       Median :     1                      
##  Mean   :  1009                       Mean   :   904                      
##  3rd Qu.:    86                       3rd Qu.:    78                      
##  Max.   :811870                       Max.   :470203                      
##  WT-rep3Aligned.sortedByCoord.out.bam cKO-rep1Aligned.sortedByCoord.out.bam
##  Min.   :     0.0                     Min.   :     0.0                     
##  1st Qu.:     0.0                     1st Qu.:     0.0                     
##  Median :     1.0                     Median :     1.0                     
##  Mean   :   874.5                     Mean   :   922.1                     
##  3rd Qu.:    77.0                     3rd Qu.:    82.0                     
##  Max.   :388953.0                     Max.   :459595.0                     
##  cKO-rep2Aligned.sortedByCoord.out.bam cKO-rep3Aligned.sortedByCoord.out.bam
##  Min.   :     0.0                      Min.   :     0.0                     
##  1st Qu.:     0.0                      1st Qu.:     0.0                     
##  Median :     1.0                      Median :     1.0                     
##  Mean   :   933.4                      Mean   :   795.3                     
##  3rd Qu.:    81.0                      3rd Qu.:    70.0                     
##  Max.   :412338.0                      Max.   :390466.0

Importing HISAT2 counts

# Extract counts by running featureCounts() 
hisat2.counts <- fcounts.fn(metadata$HISAT2_path)
## 
##         ==========     _____ _    _ ____  _____  ______          _____  
##         =====         / ____| |  | |  _ \|  __ \|  ____|   /\   |  __ \ 
##           =====      | (___ | |  | | |_) | |__) | |__     /  \  | |  | |
##             ====      \___ \| |  | |  _ <|  _  /|  __|   / /\ \ | |  | |
##               ====    ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
##         ==========   |_____/ \____/|____/|_|  \_\______/_/    \_\_____/
##        Rsubread 2.4.0
## 
## //========================== featureCounts setting ===========================\\
## ||                                                                            ||
## ||             Input files : 6 BAM files                                      ||
## ||                                                                            ||
## ||                           WT-rep1.sorted.bam                               ||
## ||                           WT-rep2.sorted.bam                               ||
## ||                           WT-rep3.sorted.bam                               ||
## ||                           cKO-rep1.sorted.bam                              ||
## ||                           cKO-rep2.sorted.bam                              ||
## ||                           cKO-rep3.sorted.bam                              ||
## ||                                                                            ||
## ||              Paired-end : no                                               ||
## ||        Count read pairs : no                                               ||
## ||              Annotation : gencode.m25.primary_assembly.annotation.gtf  ... ||
## ||      Dir for temp files : .                                                ||
## ||                 Threads : 16                                               ||
## ||                   Level : meta-feature level                               ||
## ||      Multimapping reads : counted                                          ||
## || Multi-overlapping reads : not counted                                      ||
## ||   Min overlapping bases : 1                                                ||
## ||                                                                            ||
## \\============================================================================//
## 
## //================================= Running ==================================\\
## ||                                                                            ||
## || Load annotation file gencode.m25.primary_assembly.annotation.gtf ...       ||
## ||    Features : 843712                                                       ||
## ||    Meta-features : 55487                                                   ||
## ||    Chromosomes/contigs : 45                                                ||
## ||                                                                            ||
## || Process BAM file WT-rep1.sorted.bam...                                     ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 63363619                                             ||
## ||    Successfully assigned alignments : 48324485 (76.3%)                     ||
## ||    Running time : 0.07 minutes                                             ||
## ||                                                                            ||
## || Process BAM file WT-rep2.sorted.bam...                                     ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 56580014                                             ||
## ||    Successfully assigned alignments : 43217333 (76.4%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file WT-rep3.sorted.bam...                                     ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 54969075                                             ||
## ||    Successfully assigned alignments : 41849987 (76.1%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep1.sorted.bam...                                    ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 57670244                                             ||
## ||    Successfully assigned alignments : 44111169 (76.5%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep2.sorted.bam...                                    ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 58054098                                             ||
## ||    Successfully assigned alignments : 44514481 (76.7%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep3.sorted.bam...                                    ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 50205194                                             ||
## ||    Successfully assigned alignments : 38062143 (75.8%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Write the final count table.                                               ||
## || Write the read assignment summary.                                         ||
## ||                                                                            ||
## \\============================================================================//
# Explore the HISAT2 count data frame
head(hisat2.counts)
##                      WT-rep1.sorted.bam WT-rep2.sorted.bam WT-rep3.sorted.bam
## ENSMUSG00000102693.1                  0                  0                  0
## ENSMUSG00000064842.1                  0                  0                  0
## ENSMUSG00000051951.5                  0                  0                  0
## ENSMUSG00000102851.1                  0                  0                  0
## ENSMUSG00000103377.1                  1                  2                  0
## ENSMUSG00000104017.1                  0                  0                  2
##                      cKO-rep1.sorted.bam cKO-rep2.sorted.bam
## ENSMUSG00000102693.1                   0                   0
## ENSMUSG00000064842.1                   0                   0
## ENSMUSG00000051951.5                   0                   0
## ENSMUSG00000102851.1                   0                   0
## ENSMUSG00000103377.1                   0                   0
## ENSMUSG00000104017.1                   1                   0
##                      cKO-rep3.sorted.bam
## ENSMUSG00000102693.1                   1
## ENSMUSG00000064842.1                   0
## ENSMUSG00000051951.5                   0
## ENSMUSG00000102851.1                   1
## ENSMUSG00000103377.1                   0
## ENSMUSG00000104017.1                   0
dim(hisat2.counts)
## [1] 55487     6
summary(hisat2.counts)
##  WT-rep1.sorted.bam WT-rep2.sorted.bam WT-rep3.sorted.bam cKO-rep1.sorted.bam
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   
##  Median :     1.0   Median :     1.0   Median :     1.0   Median :     1.0   
##  Mean   :   870.9   Mean   :   778.9   Mean   :   754.2   Mean   :   795.0   
##  3rd Qu.:    85.0   3rd Qu.:    77.0   3rd Qu.:    76.0   3rd Qu.:    81.5   
##  Max.   :755391.0   Max.   :436946.0   Max.   :361243.0   Max.   :428302.0   
##  cKO-rep2.sorted.bam cKO-rep3.sorted.bam
##  Min.   :     0.0    Min.   :     0     
##  1st Qu.:     0.0    1st Qu.:     0     
##  Median :     1.0    Median :     1     
##  Mean   :   802.3    Mean   :   686     
##  3rd Qu.:    81.0    3rd Qu.:    69     
##  Max.   :383887.0    Max.   :363272

Data cleaning: sample and gene annotation

countList <- list(salmon.counts, 
                  star.counts,
                  hisat2.counts)

# Assign names of the count data frames in the count list
names(countList) <- Aligners

# Set a function cleaning the count data frame
clean.fn <- function(df) {

    # Convert to a data frame
    df <- as.data.frame(df)

    # Assign column names
    names(df) <- SampleNames

    # Bring row names to a column
    df <- df %>% rownames_to_column(var="GENEID")

    return(df)
}


# Set a function to drop GENEID version
clean.annotation.fn <- function(df) {

    # Re-annotate without version specification
    df <- separate(df, "GENEID", c("GENEID", "Version"))

    # Remove version column
    df <- df[, colnames(df) != "Version"]

    return(df)
}

# Move GENEID to a column
for (x in Aligners) {

    countList[[x]] <- clean.fn(countList[[x]])

}


# Remove version of GENEID and duplicated rows in STAR & HISAT2 count tables
for (x in Aligners) {

    countList[[x]] <- clean.annotation.fn(countList[[x]]) %>% 

        distinct()

}



# Explore the cleaned count data frames 
head(countList[[1]])
##               GENEID  WT-rep1  WT-rep2  WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000000001 6492.000 5822.000 5614.000  6040.00  6142.00 5437.000
## 2 ENSMUSG00000000003    0.000    0.000    0.000     0.00     0.00    0.000
## 3 ENSMUSG00000000028 3904.066 3567.989 3714.192  3410.46  3463.04 3142.454
## 4 ENSMUSG00000000031   33.000   45.000   58.000    12.00    28.00   70.000
## 5 ENSMUSG00000000037  119.001   90.001   82.000   165.00   129.00  100.000
## 6 ENSMUSG00000000049    0.000    0.000    0.000     1.00     2.00    0.000
head(countList[[2]])
##               GENEID WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000102693       0       0       0        0        0        1
## 2 ENSMUSG00000064842       0       0       0        0        0        0
## 3 ENSMUSG00000051951       0       0       0        0        0        0
## 4 ENSMUSG00000102851       0       0       0        0        0        1
## 5 ENSMUSG00000103377       1       2       0        0        0        0
## 6 ENSMUSG00000104017       0       0       2        1        0        0
head(countList[[3]])
##               GENEID WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000102693       0       0       0        0        0        1
## 2 ENSMUSG00000064842       0       0       0        0        0        0
## 3 ENSMUSG00000051951       0       0       0        0        0        0
## 4 ENSMUSG00000102851       0       0       0        0        0        1
## 5 ENSMUSG00000103377       1       2       0        0        0        0
## 6 ENSMUSG00000104017       0       0       2        1        0        0
dim(countList[[1]])
## [1] 51956     7
dim(countList[[2]])
## [1] 55487     7
dim(countList[[3]])
## [1] 55487     7
sum(duplicated(countList[[1]]))
## [1] 0
sum(duplicated(countList[[2]]))
## [1] 0
sum(duplicated(countList[[3]]))
## [1] 0
# Convert Salmon counts to integers 
countList[["Salmon"]] <- cbind(GENEID=countList[["Salmon"]][, "GENEID"],
                               round(countList[["Salmon"]][, 
                               colnames(countList[["Salmon"]]) %in% SampleNames]))

# Explore the cleaned count data frames 
head(countList[[1]])
##               GENEID WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000000001    6492    5822    5614     6040     6142     5437
## 2 ENSMUSG00000000003       0       0       0        0        0        0
## 3 ENSMUSG00000000028    3904    3568    3714     3410     3463     3142
## 4 ENSMUSG00000000031      33      45      58       12       28       70
## 5 ENSMUSG00000000037     119      90      82      165      129      100
## 6 ENSMUSG00000000049       0       0       0        1        2        0

Plotting sequencing depth

Number of total counts per sample

# Set a function generating a data frame with sequencing depth
seq.depth.fn <- function(df, aligner) {

    seqdf <- as.data.frame(colSums(df[, SampleNames])) %>% 
        rownames_to_column (var="Sample") %>% 
        mutate(Aligner=aligner)

    names(seqdf) <- c("Sample", "Count", "Aligner")

    return(seqdf)
}

# Set a function for a bar plot comparing values
comparing.barplot.fn <- function(df, yval, title, ytitle) {

    ggplot(df, 
       aes(x=Sample, y=yval, group=Aligner, fill=Aligner)) +
    geom_bar(stat="identity", position="dodge") +
    theme_bw() +
    ggtitle(title) + 
    ylab(ytitle)

}




# Initialize the seq depth data frame with the first aligner
seq.depth.df <- seq.depth.fn(countList[[1]], Aligners[1])

# Extend the seq depth data frame with the rest of aligners
for (x in Aligners) {

    if (x %in% Aligners[2:length(Aligners)]) {

        seq.depth.df <- rbind(seq.depth.df, 
                              seq.depth.fn(countList[[x]], x))
    }
}

# Explore how the data frame 
print(seq.depth.df)
##      Sample    Count Aligner
## 1   WT-rep1 37460304  Salmon
## 2   WT-rep2 33384135  Salmon
## 3   WT-rep3 32585959  Salmon
## 4  cKO-rep1 34197855  Salmon
## 5  cKO-rep2 34442159  Salmon
## 6  cKO-rep3 29538115  Salmon
## 7   WT-rep1 55979059    STAR
## 8   WT-rep2 50159175    STAR
## 9   WT-rep3 48522991    STAR
## 10 cKO-rep1 51163980    STAR
## 11 cKO-rep2 51790758    STAR
## 12 cKO-rep3 44128747    STAR
## 13  WT-rep1 48324485  HISAT2
## 14  WT-rep2 43217333  HISAT2
## 15  WT-rep3 41849987  HISAT2
## 16 cKO-rep1 44111169  HISAT2
## 17 cKO-rep2 44514481  HISAT2
## 18 cKO-rep3 38062143  HISAT2
summary(seq.depth.df)
##     Sample              Count            Aligner         
##  Length:18          Min.   :29538115   Length:18         
##  Class :character   1st Qu.:35196695   Class :character  
##  Mode  :character   Median :43664251   Mode  :character  
##                     Mean   :42412935                     
##                     3rd Qu.:48473364                     
##                     Max.   :55979059
# Convert character vectors to factors
seq.depth.df$Sample <- factor(seq.depth.df$Sample, 
                              levels=SampleNames)
seq.depth.df$Aligner <- factor(seq.depth.df$Aligner, 
                               levels=Aligners)

# Create a plot presenting sequencing depth
comparing.barplot.fn(seq.depth.df, 
                     seq.depth.df$Count, 
                     "Sequencing Depth by Sample and Aligner", 
                     "Count")

Generating DESeq2 objects

- vst() was run to perform variance stabilizing transformation instead of rlog() which takes longer time with similar characteristics.

- The vsd object created by vst() is used for not DE analysis but QC.

- References: DESeq2 doc “Transcript abundance files”, DESeq2 doc “Variance stabilizing transformation”

# Initialize new lists for storing dds objects
ddsList <- countList

# Initialize new lists for storing vsd objects
vsdList <- countList


for (x in Aligners) {

    # Create a count matrix from the count data frame 
    m <- countList[[x]][, colnames(countList[[x]]) != "GENEID"] %>% 
        as.matrix()

    # Assigne row names
    rownames(m) <- countList[[x]]$GENEID

    # Generate a DESeq2 object
    ddsList[[x]] <- DESeqDataSetFromMatrix(m, 
                                           colData=metadata, 
                                           design=~Group) 

    # Conduct vst
    vsdList[[x]] <- varianceStabilizingTransformation(ddsList[[x]], 
                                                      blind=TRUE) 
}

# Explore generated objects
summary(ddsList)
##        Length Class        Mode
## Salmon 51956  DESeqDataSet S4  
## STAR   55487  DESeqDataSet S4  
## HISAT2 55487  DESeqDataSet S4
summary(vsdList)
##        Length Class          Mode
## Salmon 51956  DESeqTransform S4  
## STAR   55487  DESeqTransform S4  
## HISAT2 55487  DESeqTransform S4

Estimating size factors

- black dashed line: size factor = 1

- Reference: DESeq2 doc “Sample-/gene-dependent normalization factors”

# Calculate and add size factors to the DEseq object
for (x in Aligners) {

    ddsList[[x]] <- estimateSizeFactors(ddsList[[x]])

}

# Set a function summarizing size factors by aligner to a data frame
sfactor.fn <- function(df, aligner) {

    sizefactor <- as.data.frame(round(sizeFactors(df), 3)) %>%
        rownames_to_column(var="Sample") %>%
        mutate(Aligner=aligner)

    names(sizefactor) <- c("Sample", "Size_Factor", "Aligner")

    return(sizefactor)

}

# Initialize a data frame with the first aligner 
size.factor.df <- sfactor.fn(ddsList[[1]], Aligners[1])


for (x in Aligners) {

    if (x != Aligners[1]) {

        size.factor.df <- rbind(size.factor.df, 
                                sfactor.fn(ddsList[[x]], x))
    }
}


# Explore the data frame
print(size.factor.df)
##      Sample Size_Factor Aligner
## 1   WT-rep1       1.110  Salmon
## 2   WT-rep2       0.998  Salmon
## 3   WT-rep3       0.980  Salmon
## 4  cKO-rep1       1.020  Salmon
## 5  cKO-rep2       1.034  Salmon
## 6  cKO-rep3       0.894  Salmon
## 7   WT-rep1       1.107    STAR
## 8   WT-rep2       1.000    STAR
## 9   WT-rep3       0.977    STAR
## 10 cKO-rep1       1.019    STAR
## 11 cKO-rep2       1.038    STAR
## 12 cKO-rep3       0.891    STAR
## 13  WT-rep1       1.107  HISAT2
## 14  WT-rep2       1.000  HISAT2
## 15  WT-rep3       0.976  HISAT2
## 16 cKO-rep1       1.019  HISAT2
## 17 cKO-rep2       1.038  HISAT2
## 18 cKO-rep3       0.891  HISAT2
# Convert character vectors to factors
size.factor.df$Sample <- factor(size.factor.df$Sample, 
                              levels=SampleNames)
size.factor.df$Aligner <- factor(size.factor.df$Aligner, 
                               levels=Aligners)

# Plot calculated size factors
comparing.barplot.fn(size.factor.df, 
                     size.factor.df$Size_Factor,  
                     "Size Factors by Aligner and Sample", 
                     "Size Factor") + geom_hline(yintercept=1, linetype="dashed", color="black", size=1)

Estimating dispersion and Wald test

- Dispersion is calculated as a measure of variation instead of variance since variance gets larger when gene expression gets higher.

- Wald test is the default setting of DESeq2 which tests null hypothesis between two groups. You should use Likelihood ratio test (LRT) when comparing more than two groups.

- References: Harvard Chan Bioinformatics Core workshop I, Harvard Chan Bioinformatics Core workshop II, Harvard Chan Bioinformatics Core workshop III, DESeq2 "Wald test indivisual steps, DESeq2 doc “Likelihood ratio test”

for (x in Aligners) {

    # Dispersion
    ddsList[[x]] <- estimateDispersions(ddsList[[x]])
    
    # Wald test
    ddsList[[x]] <- nbinomWaldTest(ddsList[[x]])

}


# Explore generated data in the dds object 
ddsList[[1]]
## class: DESeqDataSet 
## dim: 51956 6 
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(51956): ENSMUSG00000000001 ENSMUSG00000000003 ...
##   ENSMUSG00001118662 ENSMUSG00001118663
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(6): Sample Group ... HISAT2_path sizeFactor

Sample QC: Principal Component Analysis (PCA)

Identifies source of variation and sample outliers

- Reference: DESeq2 doc “Principal component plot of the samples”, DESeq2 doc “Heatmap of the sample-to-sample distances”

# Assigne what to compare
GroupOfInterest <- Contrast[1]


# Set a function for sample pca
qcpca.fn <- function(obj, title) {

    plotPCA(obj,
        intgroup=GroupOfInterest,
        returnData=FALSE) + theme_bw() + ggtitle(paste("PCA:", title)) 

}

# Print the plots
qcpca.fn(vsdList[[1]], Aligners[1]) 

qcpca.fn(vsdList[[2]], Aligners[2])

qcpca.fn(vsdList[[3]], Aligners[3]) 

Sample QC: Sample Correlation Heatmap

Identifies distance between samples & correlation in a group

# Heatmap annotation
HeatmapAnno <- metadata[, c("Sample", "Group")]

# Set a function generating a correlation heatmap
cheatmap.fn <- function(df, title) {

    # Extract a normalized count matrix
    vm <- assay(df)

    # Generate a correlation matrix
    cm <- cor(vm)

    # Generate a heatmap
    pheatmap(cm, 
             annotation=HeatmapAnno, 
             main=paste("Sample Correlation Heatmap:", title))
}


# Print the heatmaps
cheatmap.fn(vsdList[[1]], Aligners[1])

cheatmap.fn(vsdList[[2]], Aligners[2])

cheatmap.fn(vsdList[[3]], Aligners[3])

Running DE analysis

# Run DESeq 
for (x in Aligners) {

    ddsList[[x]] <- DESeq(ddsList[[x]])
    # Check result names 
    ResNames <- resultsNames(ddsList[[x]])
    print(ResNames)

}
## [1] "Intercept"       "Group_cKO_vs_WT"
## [1] "Intercept"       "Group_cKO_vs_WT"
## [1] "Intercept"       "Group_cKO_vs_WT"

Creating dispersion plots

- Dispersion is important since estimation by DESeq2 algorithm is based on the assumption that genes with similar expression levels have similar dispersion. If an RNA-seq dataset doesn’t satisfy this assumption, use other DE algorithms than DESeq2.

- References: DESeq2 doc "Dispersion plot and fitting alternatives, Harvard Chan Bioinformatics Core workshop

# Set a function plotting dispersion
dplot.fn <- function(dds, title) {

    plotDispEsts(dds, 
                 main=paste("Dispersion over Counts:", title))
}

# Plot dispersion patterns
dplot.fn(ddsList[[1]], Aligners[1])

dplot.fn(ddsList[[2]], Aligners[2])

dplot.fn(ddsList[[3]], Aligners[3])

# Do they fit well with the DESeq2 estimation model?

Setting how to extract fold-change results

Change variables below

- The alpha denotes threshold of false discovery rate (FDR) assigned by users.

- In this analysis, the alpha is set to 0.1

# Set FDR threshold alpha
alpha=0.1

# Set the coefficients to compare 
Coef <- ResNames[-1]
print(Coef) 
## [1] "Group_cKO_vs_WT"
# Set a function to clean a result table 
lfctable.fn <- function(df) {
    df <- df %>% 
        rownames_to_column(var="GENEID") %>%
        mutate(FDR=ifelse(padj < 0.1 & !is.na(padj), 
                                   "< 0.1", 
                                   "> 0.1")) 
    return(df)
}

# Set a function extracting results
extract.lfc.fn <- function(dds) {

    res <- results(dds, contrast=Contrast, alpha=alpha)
    lfctable.fn(as.data.frame(res))

    return(lfctable.fn(as.data.frame(res)))


}

Extracting log2FoldChanges

You can change alpha depending on your interest of FDR level

Shrinkage is NOT applied in this analysis

# Initialize a list storing lfc data frames
lfcList <- countList

# Extract DE results
# The Contrast variable was defined in the previous chunk
# Extraction with no shrinkage
# alpha: FDR threshold
for (x in Aligners) {

    lfcList[[x]] <- extract.lfc.fn(ddsList[[x]]) %>% mutate(Alignment=x)

    print(head(lfcList[[x]]))

}
##               GENEID     baseMean log2FoldChange      lfcSE       stat
## 1 ENSMUSG00000000001 5892.4376208    -0.04367298 0.05134311 -0.8506103
## 2 ENSMUSG00000000003    0.0000000             NA         NA         NA
## 3 ENSMUSG00000000028 3514.8301315     0.09209382 0.06403838  1.4381034
## 4 ENSMUSG00000000031   41.8581466     0.19865308 0.59900127  0.3316405
## 5 ENSMUSG00000000037  113.2455289    -0.50352734 0.23268971 -2.1639433
## 6 ENSMUSG00000000049    0.4857333    -2.44912383 3.56194529 -0.6875804
##      pvalue      padj   FDR Alignment
## 1 0.3949859 0.7963511 > 0.1    Salmon
## 2        NA        NA > 0.1    Salmon
## 3 0.1504047 0.5392314 > 0.1    Salmon
## 4 0.7401607 0.9435540 > 0.1    Salmon
## 5 0.0304687 0.2269216 > 0.1    Salmon
## 6 0.4917170        NA > 0.1    Salmon
##               GENEID  baseMean log2FoldChange    lfcSE       stat    pvalue
## 1 ENSMUSG00000102693 0.1870490     -1.0278145 4.080473 -0.2518861 0.8011291
## 2 ENSMUSG00000064842 0.0000000             NA       NA         NA        NA
## 3 ENSMUSG00000051951 0.0000000             NA       NA         NA        NA
## 4 ENSMUSG00000102851 0.1870490     -1.0278145 4.080473 -0.2518861 0.8011291
## 5 ENSMUSG00000103377 0.4837499      2.3680696 3.152915  0.7510730 0.4526087
## 6 ENSMUSG00000104017 0.5047301      0.8868952 3.201764  0.2770020 0.7817786
##   padj   FDR Alignment
## 1   NA > 0.1      STAR
## 2   NA > 0.1      STAR
## 3   NA > 0.1      STAR
## 4   NA > 0.1      STAR
## 5   NA > 0.1      STAR
## 6   NA > 0.1      STAR
##               GENEID  baseMean log2FoldChange    lfcSE       stat    pvalue
## 1 ENSMUSG00000102693 0.1870355     -1.0280029 4.080473 -0.2519323 0.8010934
## 2 ENSMUSG00000064842 0.0000000             NA       NA         NA        NA
## 3 ENSMUSG00000051951 0.0000000             NA       NA         NA        NA
## 4 ENSMUSG00000102851 0.1870355     -1.0280029 4.080473 -0.2519323 0.8010934
## 5 ENSMUSG00000103377 0.4838032      2.3677771 3.123686  0.7580073 0.4484466
## 6 ENSMUSG00000104017 0.5050522      0.8868523 3.167217  0.2800099 0.7794699
##   padj   FDR Alignment
## 1   NA > 0.1    HISAT2
## 2   NA > 0.1    HISAT2
## 3   NA > 0.1    HISAT2
## 4   NA > 0.1    HISAT2
## 5   NA > 0.1    HISAT2
## 6   NA > 0.1    HISAT2
# Initialize a data frame storing total lfc results across the aligners
lfc.dataframe <- lfcList[[1]] 

for (x in Aligners[2:length(Aligners)]) {

    lfc.dataframe <- rbind(lfc.dataframe, 
                           lfcList[[x]])

}


lfc.dataframe$Alignment <- factor(lfc.dataframe$Alignment, 
                                  levels=Aligners)

Exploring distribution of false discovery rate (FDR)

Black dashed line: FDR = 0.1

# Plot distribution of FDR 
ggplot(lfc.dataframe, 
       aes(x=padj, y=..count.., color=Alignment)) + 
    geom_density(size=1) + 
    theme_bw() + 
    scale_x_log10() + 
    ggtitle("Distribution of False Discovery Rate (FDR) by Aligner") + 
    ylab("Count") +
    xlim(0.00001, 1) + 
    geom_vline(xintercept=alpha, 
               color="black", 
               size=1, linetype="dashed") + scale_x_continuous(breaks=seq(0, 1, by=0.1))

Presenting distribution of log2FoldChange

Black: total genes (padj =/= NA)

Colored: genes above or below FDR=0.1

valid.lfc.df <- subset(lfc.dataframe, FDR == "< 0.1")

ggplot(valid.lfc.df,
       aes(x=log2FoldChange,
           y=..count.., 
           color=Alignment)) +
geom_density(size=1) + 
theme_bw() + 
geom_vline(xintercept=c(-1, 1), 
           linetype="dashed", color="black", size=1) + 
ggtitle("Distribution of log2FoldChange Values by Aligner (FDR < 0.1)") +
ylab("Count") + 
xlim(-10, 10)    # Change xlim by datatype

Exploring mean-difference with an MA plot

- x-axis: expression level (baseMean))

- y-axis: fold change (log2FoldChange)

- Red dashed lines: log2FoldChange = -1 and 1

# Set ylim: has to adjusted by users depending on data 
yl <- c(-10, 10)

# Set min log2 fold change of interest 
mLog <- c(-1, 1)

# Create MA plots by Aligner
ggplot(lfc.dataframe, aes(x=baseMean, y=log2FoldChange, color=FDR)) + 
        geom_point() + 
        facet_grid(~Alignment) +
        scale_x_log10() + 
        theme_bw() + 
        scale_color_manual(values=c("blue", "grey")) + 
        ggtitle(paste("MA plot")) + 
        ylim(yl[1], yl[2]) + 
        theme(strip.text.x=element_text(size=10)) +
        geom_hline(yintercept=c(mLog[1], mLog[2]), 
                   linetype="dashed", color="red") 

Exploring expression profiling with normalized count data

- Normalized count matrices are extracted from dds objects and filtered with thresholds set at FDR and log2FoldChange

- The heatmaps display z-scores of the normalized counts

- In this analysis, mLog = 1

- References: Harvard Chan Bioinformatics Core workshop, DESeq2 doc “Heatmap of the count matrix”

# Initialize a list 
heatmap.df.List <- lfcList

# Filter genes with FDR < alpha and absolute log2FoldChange > 1
for (x in Aligners) {

    # Set a logical vector filtering FDR below alpha
    is.fdr.valid <- lfcList[[x]]$FDR == paste("<", alpha)

    # Set a logical vector filtering absolute lfc above 1 
    is.lfc.large <- abs(lfcList[[x]]$log2FoldChange) > mLog[2]

    # Extract total normalized counts
    norm.counts <- counts(ddsList[[x]], normalized=T)

    # Save filtered genes only from the normalized count data
    heatmap.df.List[[x]] <- norm.counts[is.fdr.valid & is.lfc.large,]

}

# Explore the cleaned data frames 
head(heatmap.df.List[[1]])
##                     WT-rep1  WT-rep2  WT-rep3   cKO-rep1   cKO-rep2   cKO-rep3
## ENSMUSG00000000125 17.11371 17.03764 12.24046  19.617050  81.208977  48.112902
## ENSMUSG00000000303 23.41876 21.04649 40.80153   6.865968   4.833868  10.070142
## ENSMUSG00000000673 28.82309 29.06421 31.62119  11.770230  15.468377  11.189047
## ENSMUSG00000000686 85.56853 80.17712 84.66318  39.234100  35.770621  32.448236
## ENSMUSG00000000869 60.34833 64.14170 15.30058  22.559608   8.700962   6.713428
## ENSMUSG00000000957 24.31948 24.05314 15.30058 100.046956 164.351502 111.890470
head(heatmap.df.List[[2]])
##                       WT-rep1    WT-rep2    WT-rep3   cKO-rep1   cKO-rep2
## ENSMUSG00000025929  151.77009   57.97408   16.37872   7.848327   7.705361
## ENSMUSG00000041872  131.89543   62.97184   16.37872  14.715613  14.447553
## ENSMUSG00000047180 1469.82104 1545.30903 1586.68816 626.885122 470.027043
## ENSMUSG00000026072   49.68664   46.97899   57.32551  10.791450   5.779021
## ENSMUSG00000026070  542.93943  570.74480  901.85308 417.923415 242.718883
## ENSMUSG00000025969   33.42556   36.98346   30.71009  60.824535  78.016784
##                      cKO-rep3
## ENSMUSG00000025929   5.611470
## ENSMUSG00000041872   8.978352
## ENSMUSG00000047180 622.873185
## ENSMUSG00000026072  14.589822
## ENSMUSG00000026070 253.638450
## ENSMUSG00000025969  65.093054
head(heatmap.df.List[[3]])
##                       WT-rep1    WT-rep2    WT-rep3   cKO-rep1   cKO-rep2
## ENSMUSG00000025929  150.82592   57.99044   16.38913   7.853379   7.708966
## ENSMUSG00000041872  127.34404   58.99027   12.29185  13.743413  15.417932
## ENSMUSG00000047180 1458.58602 1537.74640 1576.42924 623.361945 468.319699
## ENSMUSG00000026072   49.67321   45.99242   57.36195  10.798396   5.781725
## ENSMUSG00000026070  525.63286  555.90832  872.72106 409.357372 235.123470
## ENSMUSG00000025969   34.31967   37.99373   32.77826  57.918669  82.871387
##                      cKO-rep3
## ENSMUSG00000025929   5.611066
## ENSMUSG00000041872   8.977705
## ENSMUSG00000047180 617.217229
## ENSMUSG00000026072  14.588771
## ENSMUSG00000026070 246.886892
## ENSMUSG00000025969  70.699428
dim(heatmap.df.List[[1]])
## [1] 237   6
dim(heatmap.df.List[[2]])
## [1] 236   6
dim(heatmap.df.List[[3]])
## [1] 240   6
pheatmap(heatmap.df.List[[3]], 
         annotation=HeatmapAnno,
         scale="row",
         show_rownames=F)

# Set a function creating a profiling heatmap
profile.heatmap.fn <- function(df, title) {

    pheatmap(df, 
             annotation=HeatmapAnno, 
             scale="row", 
             show_rownames=F,
             main=paste("Expression Profiling by", title, "(FDR < 0.1, absolute log2FoldChange > 1)"))
}

# Print the expression heatmaps
profile.heatmap.fn(heatmap.df.List[[Aligners[1]]], Aligners[1])

profile.heatmap.fn(heatmap.df.List[[Aligners[2]]], Aligners[2])

profile.heatmap.fn(heatmap.df.List[[Aligners[3]]], Aligners[3])

NA statistics: zero count genes & outlier genes

When NAs appear in

- log2FoldChange: zero counts in all samples

- padj: too little information

- pval & padj: at least one replicate was an outlier

# Count number of NA genes  
type=c("Zero Counts", "Outliers", "Total NA Genes") 

# Create a data frame storing number of NA genes by type
NA.genes <- lfc.dataframe %>% 
    group_by(Alignment) %>% 
    summarize(zero=sum(is.na(log2FoldChange)), 
              outlier=sum(is.na(pvalue) & is.na(padj))) %>% 
    mutate(total=zero + outlier) %>%
    gather(Type, Number, -Alignment) %>% 
    mutate(Type=factor(case_when(Type == "zero" ~ type[1],
                                 Type == "outlier" ~ type[2],
                                 Type == "total" ~ type[3]),
                       levels=type))

# Plot number of NA genes 
ggplot(NA.genes, 
       aes(x=Type, y=Number, group=Alignment, fill=Alignment, label=Number)) + 
    geom_bar(stat="identity", position="dodge") + 
    theme_bw() +
    geom_text(position=position_dodge(width=1), vjust=1.5) + 
    ggtitle("Number of NA Genes") + 
    ylab("Number of Genes")

Ranking DEGs by alginer

- fdr.rank: ranked by FDR

- lfc.rank: ranked by absolute fold change

- up.lfc.rank: ranked by magnitude of fold change increase

- down.lfc.rank: ranked by manitude of fold change decrease

# Create a new list having DE table with FDR below alpha
fdr.rank <- lfcList
lfc.rank <- lfcList
up.lfc.rank <- lfcList
down.lfc.rank <- lfcList

# Set a sorting genes with FDR below alpha 
filter.fdr.fn <- function(df) {as.data.table(df[df$FDR == paste("<", alpha),])}

# Set a function creating a column assigning ranking 
Ranking.fn <- function(x) {mutate(x, Rank=1:nrow(x))}


for (x in Aligners) {

    rdf <- lfcList[[x]]

    fdr.rank[[x]] <- filter.fdr.fn(rdf) %>% arrange(padj) %>% Ranking.fn()

    lfc.rank[[x]] <- filter.fdr.fn(rdf) %>% arrange(desc(abs(log2FoldChange))) %>% Ranking.fn()

    up.lfc.rank[[x]] <- filter.fdr.fn(rdf) %>% arrange(desc(log2FoldChange)) %>% Ranking.fn() 

    down.lfc.rank[[x]] <- filter.fdr.fn(rdf) %>% arrange(log2FoldChange) %>% Ranking.fn()

}



# Explore the ranking outputs
head(fdr.rank[[1]])
##                GENEID  baseMean log2FoldChange      lfcSE     stat       pvalue
## 1: ENSMUSG00000026463 1502.8421       1.756275 0.08663102 20.27305 2.224146e-91
## 2: ENSMUSG00000022018  645.8927       1.347227 0.10704067 12.58612 2.517385e-36
## 3: ENSMUSG00000018008 1210.7664       1.402993 0.11422982 12.28219 1.128950e-34
## 4: ENSMUSG00000047180 1007.2280       1.438573 0.11796721 12.19469 3.317803e-34
## 5: ENSMUSG00000020689 2312.6218       1.535357 0.13574123 11.31091 1.158839e-29
## 6: ENSMUSG00000040498  715.3563       2.225236 0.20593692 10.80542 3.244563e-27
##            padj   FDR Alignment Rank
## 1: 3.341112e-87 < 0.1    Salmon    1
## 2: 1.890808e-32 < 0.1    Salmon    2
## 3: 5.653031e-31 < 0.1    Salmon    3
## 4: 1.246001e-30 < 0.1    Salmon    4
## 5: 3.481617e-26 < 0.1    Salmon    5
## 6: 8.123305e-24 < 0.1    Salmon    6
head(lfc.rank[[1]])
##                GENEID  baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSMUSG00000102298 85.752374      -9.903022 1.2015868 -8.241620 1.698947e-16
## 2: ENSMUSG00000103587 27.821960      -8.280479 1.2363032 -6.697774 2.116189e-11
## 3: ENSMUSG00000081228 17.885859       7.574021 1.2698031  5.964721 2.450527e-09
## 4: ENSMUSG00000017002 12.970780       3.822220 0.9846944  3.881631 1.037582e-04
## 5: ENSMUSG00000087412  9.360873       3.714086 1.1626254  3.194568 1.400403e-03
## 6: ENSMUSG00000085328 12.858566      -3.563589 1.1353063 -3.138879 1.695954e-03
##            padj   FDR Alignment Rank
## 1: 1.239003e-13 < 0.1    Salmon    1
## 2: 7.064308e-09 < 0.1    Salmon    2
## 3: 4.231244e-07 < 0.1    Salmon    3
## 4: 4.080252e-03 < 0.1    Salmon    4
## 5: 2.707447e-02 < 0.1    Salmon    5
## 6: 3.078342e-02 < 0.1    Salmon    6
head(up.lfc.rank[[1]])
##                GENEID  baseMean log2FoldChange     lfcSE     stat       pvalue
## 1: ENSMUSG00000081228 17.885859       7.574021 1.2698031 5.964721 2.450527e-09
## 2: ENSMUSG00000017002 12.970780       3.822220 0.9846944 3.881631 1.037582e-04
## 3: ENSMUSG00000087412  9.360873       3.714086 1.1626254 3.194568 1.400403e-03
## 4: ENSMUSG00000025929 39.706078       3.517502 0.7223776 4.869340 1.119719e-06
## 5: ENSMUSG00000083327 10.935695       3.318340 1.1527367 2.878663 3.993655e-03
## 6: ENSMUSG00000037129 21.690990       2.997591 0.6276420 4.775956 1.788555e-06
##            padj   FDR Alignment Rank
## 1: 4.231244e-07 < 0.1    Salmon    1
## 2: 4.080252e-03 < 0.1    Salmon    2
## 3: 2.707447e-02 < 0.1    Salmon    3
## 4: 9.449671e-05 < 0.1    Salmon    4
## 5: 5.639709e-02 < 0.1    Salmon    5
## 6: 1.421570e-04 < 0.1    Salmon    6
head(down.lfc.rank[[1]])
##                GENEID baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSMUSG00000102298 85.75237      -9.903022 1.2015868 -8.241620 1.698947e-16
## 2: ENSMUSG00000103587 27.82196      -8.280479 1.2363032 -6.697774 2.116189e-11
## 3: ENSMUSG00000085328 12.85857      -3.563589 1.1353063 -3.138879 1.695954e-03
## 4: ENSMUSG00000061769 12.38418      -3.307887 0.9143499 -3.617748 2.971777e-04
## 5: ENSMUSG00000043932 15.05964      -3.221949 0.8446794 -3.814405 1.365114e-04
## 6: ENSMUSG00000058626 15.48124      -2.952846 0.9249527 -3.192429 1.410816e-03
##            padj   FDR Alignment Rank
## 1: 1.239003e-13 < 0.1    Salmon    1
## 2: 7.064308e-09 < 0.1    Salmon    2
## 3: 3.078342e-02 < 0.1    Salmon    3
## 4: 8.946299e-03 < 0.1    Salmon    4
## 5: 5.063395e-03 < 0.1    Salmon    5
## 6: 2.717086e-02 < 0.1    Salmon    6

Calculating rank difference between STAR and HISAT2

Salmon was excluded due to extremely small number of genes found

# Set a function rebuilding DE tables with gene ranks 
rankdiff.fn <- function(List){

    # Select columns and join the data frames by gene
    full_join(List[[Aligners[2]]][, .(GENEID, Alignment, Rank, baseMean)], 
              List[[Aligners[3]]][, .(GENEID, Alignment, Rank, baseMean)], 
              by="GENEID") %>%
    
    # Add columns assining gene expression levels, rank differences, and mean ranks
    mutate(logMeanExpression=log(baseMean.x+baseMean.y/2),
           RankDiff=Rank.x-Rank.y, 
           MeanRank=(Rank.x+Rank.y)/2)
} 

# Calculate rank difference by ranking type
fdr.rankdiff <- rankdiff.fn(fdr.rank)
lfc.rankdiff <- rankdiff.fn(lfc.rank)
up.lfc.rankdiff <- rankdiff.fn(up.lfc.rank)
down.lfc.rankdiff <- rankdiff.fn(down.lfc.rank)

# Explore the calculated rank differences
head(fdr.rankdiff)
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000026463        STAR      1  1562.6504      HISAT2      1
## 2: ENSMUSG00000022018        STAR      2   686.0206      HISAT2      2
## 3: ENSMUSG00000018008        STAR      3  1295.3734      HISAT2      3
## 4: ENSMUSG00000047180        STAR      4  1053.6006      HISAT2      4
## 5: ENSMUSG00000020689        STAR      5  2430.9495      HISAT2      5
## 6: ENSMUSG00000040498        STAR      6   763.7041      HISAT2     41
##    baseMean.y logMeanExpression RankDiff MeanRank
## 1:  1537.4785          7.754220        0      1.0
## 2:   667.0280          6.927102        0      2.0
## 3:  1252.9425          7.561041        0      3.0
## 4:  1046.9434          7.363325        0      4.0
## 5:  2385.4226          8.195240        0      5.0
## 6:   743.7298          7.034889      -35     23.5
head(lfc.rankdiff)
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000103587        STAR      1   42.32553      HISAT2      1
## 2: ENSMUSG00000020713        STAR      2   11.98750        <NA>     NA
## 3: ENSMUSG00000017002        STAR      3   12.81535      HISAT2      2
## 4: ENSMUSG00000061769        STAR      4   14.59555      HISAT2      3
## 5: ENSMUSG00000025929        STAR      5   41.21467      HISAT2      4
## 6: ENSMUSG00000043932        STAR      6   16.59888      HISAT2      5
##    baseMean.y logMeanExpression RankDiff MeanRank
## 1:   41.48362          4.144203        0      1.0
## 2:         NA                NA       NA       NA
## 3:   12.65005          2.951800        1      2.5
## 4:   14.27835          3.078911        1      3.5
## 5:   41.06315          4.123033        1      4.5
## 6:   16.44317          3.211669        1      5.5
head(up.lfc.rankdiff)
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000017002        STAR      1   12.81535      HISAT2      1
## 2: ENSMUSG00000025929        STAR      2   41.21467      HISAT2      2
## 3: ENSMUSG00000017897        STAR      3  140.16560      HISAT2      5
## 4: ENSMUSG00000037129        STAR      4   22.03950      HISAT2      3
## 5: ENSMUSG00000034115        STAR      5   14.97109      HISAT2      4
## 6: ENSMUSG00000040653        STAR      6   11.92657      HISAT2      7
##    baseMean.y logMeanExpression RankDiff MeanRank
## 1:   12.65005          2.951800        0      1.0
## 2:   41.06315          4.123033        0      2.0
## 3:  132.92687          5.330925       -2      4.0
## 4:   21.69879          3.493135        1      3.5
## 5:   14.83145          3.108472        1      4.5
## 6:   10.47964          2.842953       -1      6.5
head(down.lfc.rankdiff)
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000103587        STAR      1   42.32553      HISAT2      1
## 2: ENSMUSG00000020713        STAR      2   11.98750        <NA>     NA
## 3: ENSMUSG00000061769        STAR      3   14.59555      HISAT2      2
## 4: ENSMUSG00000043932        STAR      4   16.59888      HISAT2      3
## 5: ENSMUSG00000058626        STAR      5   17.88988      HISAT2      5
## 6: ENSMUSG00000030178        STAR      6   21.10503      HISAT2      4
##    baseMean.y logMeanExpression RankDiff MeanRank
## 1:   41.48362          4.144203        0      1.0
## 2:         NA                NA       NA       NA
## 3:   14.27835          3.078911        1      2.5
## 4:   16.44317          3.211669        1      3.5
## 5:   17.73726          3.286853        0      5.0
## 6:   19.63052          3.431413        2      5.0

Visualizing DEG rankings

- x-axis: genes aligned by STAR

- y-axis: genes aligned by HISAT2

- Black diagonal lines: ranking in STAR = HISAT2

- Dot color: gene expression level (log-baseMean)

- 409 genes were missing in the plots

# Set a function plotting DEG rankings 
ranking.plot.fn <- function(df, rankedby) {

    ggplot(df, 
           aes(x=Rank.x, y=Rank.y, color=logMeanExpression)) + geom_point(alpha=0.5) + theme_bw() + theme(strip.text.x=element_text(size=10)) + xlab("Ranking in STAR") + ylab("Ranking in HISAT2") + geom_abline(slope=1, color="black", size=0.5) + ggtitle(paste("Gene Ranking by", rankedby)) + scale_color_gradient(low="blue", high="red") 
}

# Plot rankings by ranking type
ranking.plot.fn(fdr.rankdiff, "FDR")

ranking.plot.fn(lfc.rankdiff, "Log2FoldChange")

ranking.plot.fn(up.lfc.rankdiff, "Log2FoldChange (Increase)")

ranking.plot.fn(down.lfc.rankdiff, "Log2FoldChange (Decrease)")

Visualizing DEG ranks II: Rank difference

- x-axis: expression level (log-baseMean)

- y-axis: rank difference (STAR ranking - HISAT2 ranking)

- Black horizontal lines: ranking in STAR = HISAT2

- Dot color: average ranking

- 409 genes were missing in the plots

# Set a function plotting DEG rank difference
rankdiff.plot.fn <- function(df, rankedby, ylim) { 

    ggplot(df, aes(x=logMeanExpression, y=RankDiff, color=MeanRank)) + 
        geom_point(alpha=0.5) + 
        theme_bw() +         
        ylab("Rank Difference (STAR - HISAT2)") + 
        ggtitle(paste("Rank Difference (STAR - HISAT2)\n in", rankedby)) + 
        geom_hline(yintercept=0, color="black", size=0.5) + scale_color_gradient(low="blue", high="red") +
        ylim(-ylim, ylim)

}



# Display the plots in the same y-scale
rankdiff.plot.fn(fdr.rankdiff, "FDR", 1100)

rankdiff.plot.fn(lfc.rankdiff, "Log2FoldChange", 1100)

rankdiff.plot.fn(up.lfc.rankdiff, "Log2FoldChange (Increase)", 1100)

rankdiff.plot.fn(down.lfc.rankdiff, "Log2FoldChange (Decrease)", 1100)

# Display the plots in free y-scale
rankdiff.plot.fn(fdr.rankdiff, "FDR (free y-scale)", 1100)

rankdiff.plot.fn(lfc.rankdiff, "Log2FoldChange (free y-scale)", 500)

rankdiff.plot.fn(up.lfc.rankdiff, "Log2FoldChange (Increase, free y-scale)", 150)

rankdiff.plot.fn(down.lfc.rankdiff, "Log2FoldChange (Decrease, free y-scale)", 250)

Distribution of rank difference

# Create a list storing rankdiff data frames 
rankList <- list(fdr.rankdiff,
                 lfc.rankdiff,
                 up.lfc.rankdiff,
                 down.lfc.rankdiff)


# Assine result column as a factor with levels 
rankdiff.levels <- c("FDR", 
                     "log2FoldChange", 
                     "log2FoldChange.Increase", 
                     "log2FoldChange.Decrease")

# Name the list 
names(rankList) <- rankdiff.levels


# Create a new data frame storing rank difference by result type
rankdiff.dist <- data.frame(FDR=abs(rankList[[1]]$RankDiff), 
                            log2FoldChange=abs(rankList[[2]]$RankDiff),
                            log2FoldChange.Increase=abs(rankList[[3]]$RankDiff),
                            log2FoldChange.Decrease=abs(rankList[[4]]$RankDiff)) %>% gather(Result, RankDiff) 

rankdiff.dist$Result <- factor(rankdiff.dist$Result, levels=rankdiff.levels)

# Plot distribution of absolute rank difference
ggplot(rankdiff.dist,
       aes(x=Result, y=RankDiff, color=Result)) +
geom_jitter(alpha=0.5) + 
geom_boxplot(alpha=0.5, fill="grey", color="black") + 
theme_bw() + 
theme(axis.text.x=element_text(angle=45, hjust=1)) +
ggtitle("Distribution of Absolute Rank Difference without Shrinkage \n (STAR - HISAT2)") + 
ylab("Absolute Rank Difference (STAR - HISAT2)") 

Relationship between rank difference and number of transcript versions

- y-axis: abs(TPM-Count inputs)

- x-axis: number of transcripts (number of transcript id / gene id)

- dot color: mean rank

# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>% 
    group_by(GENEID) %>% 
    summarize(num.trans=n_distinct(TXID))

# Set a function adding the number of transcripts by gene id 
add.ntrans.fn <- function(df) {

    left_join(df, AnnoDb.ntrans, by="GENEID")}




# Add a column indicating the number of transcripts by gene id to every rankdiff data frame
for (x in rankdiff.levels) {

    rankList[[x]] <- add.ntrans.fn(rankList[[x]])
}


# Explore the edited data frames
summary(rankList)
##                         Length Class      Mode
## FDR                     11     data.table list
## log2FoldChange          11     data.table list
## log2FoldChange.Increase 11     data.table list
## log2FoldChange.Decrease 11     data.table list
head(rankList[[1]])
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000026463        STAR      1  1562.6504      HISAT2      1
## 2: ENSMUSG00000022018        STAR      2   686.0206      HISAT2      2
## 3: ENSMUSG00000018008        STAR      3  1295.3734      HISAT2      3
## 4: ENSMUSG00000047180        STAR      4  1053.6006      HISAT2      4
## 5: ENSMUSG00000020689        STAR      5  2430.9495      HISAT2      5
## 6: ENSMUSG00000040498        STAR      6   763.7041      HISAT2     41
##    baseMean.y logMeanExpression RankDiff MeanRank num.trans
## 1:  1537.4785          7.754220        0      1.0         8
## 2:   667.0280          6.927102        0      2.0         1
## 3:  1252.9425          7.561041        0      3.0        13
## 4:  1046.9434          7.363325        0      4.0         2
## 5:  2385.4226          8.195240        0      5.0         2
## 6:   743.7298          7.034889      -35     23.5         4
head(rankList[[2]])
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000103587        STAR      1   42.32553      HISAT2      1
## 2: ENSMUSG00000020713        STAR      2   11.98750        <NA>     NA
## 3: ENSMUSG00000017002        STAR      3   12.81535      HISAT2      2
## 4: ENSMUSG00000061769        STAR      4   14.59555      HISAT2      3
## 5: ENSMUSG00000025929        STAR      5   41.21467      HISAT2      4
## 6: ENSMUSG00000043932        STAR      6   16.59888      HISAT2      5
##    baseMean.y logMeanExpression RankDiff MeanRank num.trans
## 1:   41.48362          4.144203        0      1.0         1
## 2:         NA                NA       NA       NA         1
## 3:   12.65005          2.951800        1      2.5         3
## 4:   14.27835          3.078911        1      3.5         1
## 5:   41.06315          4.123033        1      4.5         1
## 6:   16.44317          3.211669        1      5.5         1
head(rankList[[3]])
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000017002        STAR      1   12.81535      HISAT2      1
## 2: ENSMUSG00000025929        STAR      2   41.21467      HISAT2      2
## 3: ENSMUSG00000017897        STAR      3  140.16560      HISAT2      5
## 4: ENSMUSG00000037129        STAR      4   22.03950      HISAT2      3
## 5: ENSMUSG00000034115        STAR      5   14.97109      HISAT2      4
## 6: ENSMUSG00000040653        STAR      6   11.92657      HISAT2      7
##    baseMean.y logMeanExpression RankDiff MeanRank num.trans
## 1:   12.65005          2.951800        0      1.0         3
## 2:   41.06315          4.123033        0      2.0         1
## 3:  132.92687          5.330925       -2      4.0         6
## 4:   21.69879          3.493135        1      3.5         1
## 5:   14.83145          3.108472        1      4.5         2
## 6:   10.47964          2.842953       -1      6.5         2
head(rankList[[4]])
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000103587        STAR      1   42.32553      HISAT2      1
## 2: ENSMUSG00000020713        STAR      2   11.98750        <NA>     NA
## 3: ENSMUSG00000061769        STAR      3   14.59555      HISAT2      2
## 4: ENSMUSG00000043932        STAR      4   16.59888      HISAT2      3
## 5: ENSMUSG00000058626        STAR      5   17.88988      HISAT2      5
## 6: ENSMUSG00000030178        STAR      6   21.10503      HISAT2      4
##    baseMean.y logMeanExpression RankDiff MeanRank num.trans
## 1:   41.48362          4.144203        0      1.0         1
## 2:         NA                NA       NA       NA         1
## 3:   14.27835          3.078911        1      2.5         1
## 4:   16.44317          3.211669        1      3.5         1
## 5:   17.73726          3.286853        0      5.0         2
## 6:   19.63052          3.431413        2      5.0         4
# Set a function plotting rank difference vs number of transcripts 
rank.ntrans.plot.fn <- function(df, title) {

    ggplot(df, aes(x=num.trans, y=abs(RankDiff), color=MeanRank)) + 
        geom_jitter(alpha=0.5) + 
        theme_bw() + 
        ggtitle(paste("Rank Difference vs Number of Alternative Transcripts \n in", title)) + 
        xlab("Number of Alternative Transcripts") + 
        ylab("Absolute Rank Difference \n (STAR - HISAT2)") + scale_color_gradient(low="blue", high="red") 
}

# Print the plots
rank.ntrans.plot.fn(rankList[[1]], "FDR")

rank.ntrans.plot.fn(rankList[[2]], "log2FoldChange")

rank.ntrans.plot.fn(rankList[[3]], "log2FoldChange (Increase)")

rank.ntrans.plot.fn(rankList[[4]], "log2FoldChange (Decrease)")

Finding genes having large difference in rankings

# Initialize a list storing rankdiff genes 
large.rankdiff <- rankList

# Assign a vector storing minimum (thresholds) rankdiff for filtering large rankdiff genes
rankdiff.thr <- c(10, 10, 10, 10)

names(rankdiff.thr) <- rankdiff.levels

for (x in rankdiff.levels) {

    # Filter out observations below the rankdiff thresholds
    large.rankdiff[[x]] <- subset(rankList[[x]], 
                                      abs(RankDiff) > rankdiff.thr[x]) %>% 

    # Arrange by descending order of RankDiff
    arrange(desc(abs(RankDiff)))

}

# Explore the filtered genes 
summary(large.rankdiff)
##                         Length Class      Mode
## FDR                     11     data.table list
## log2FoldChange          11     data.table list
## log2FoldChange.Increase 11     data.table list
## log2FoldChange.Decrease 11     data.table list
dim(large.rankdiff[[rankdiff.levels[1]]])
## [1] 1086   11
dim(large.rankdiff[[rankdiff.levels[2]]])
## [1] 847  11
dim(large.rankdiff[[rankdiff.levels[3]]])
## [1] 727  11
dim(large.rankdiff[[rankdiff.levels[4]]])
## [1] 1124   11
head(large.rankdiff[[rankdiff.levels[1]]])
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000062518        STAR   1461  362.34316      HISAT2    496
## 2: ENSMUSG00000079410        STAR    346  158.20395      HISAT2   1224
## 3: ENSMUSG00000073490        STAR    519   71.15374      HISAT2   1376
## 4: ENSMUSG00000096904        STAR   1349  217.61001      HISAT2    572
## 5: ENSMUSG00000021091        STAR   1408   22.77447      HISAT2    696
## 6: ENSMUSG00000079391        STAR    580  117.17884      HISAT2   1284
##    baseMean.y logMeanExpression RankDiff MeanRank num.trans
## 1:  209.61909          6.146656      965    978.5         1
## 2:  118.83935          5.382767     -878    785.0         1
## 3:   55.27287          4.592998     -857    947.5         2
## 4:  169.03497          5.710849      777    960.5         1
## 5:   18.84633          3.471893      712   1052.0         1
## 6:   95.12945          5.104390     -704    932.0         1
head(large.rankdiff[[rankdiff.levels[2]]])
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000062518        STAR   1055  362.34316      HISAT2    586
## 2: ENSMUSG00000078498        STAR    902  272.90445      HISAT2    678
## 3: ENSMUSG00000093898        STAR    305   82.54365      HISAT2    499
## 4: ENSMUSG00000095661        STAR    748  284.40779      HISAT2    578
## 5: ENSMUSG00000096793        STAR    572  208.60475      HISAT2    414
## 6: ENSMUSG00000096904        STAR    586  217.61001      HISAT2    429
##    baseMean.y logMeanExpression RankDiff MeanRank num.trans
## 1:   209.6191          6.146656      469    820.5         1
## 2:   144.3794          5.843817      224    790.0         2
## 3:   105.9320          4.909043     -194    402.0         1
## 4:   185.7112          5.932944      170    663.0         2
## 5:   144.8256          5.638417      158    493.0         1
## 6:   169.0350          5.710849      157    507.5         1
head(large.rankdiff[[rankdiff.levels[3]]])
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000062518        STAR   1002  362.34316      HISAT2   1171
## 2: ENSMUSG00000093898        STAR   1353   82.54365      HISAT2   1209
## 3: ENSMUSG00000079410        STAR   1367  158.20395      HISAT2   1261
## 4: ENSMUSG00000073490        STAR   1396   71.15374      HISAT2   1303
## 5: ENSMUSG00000019817        STAR   1376  623.16765      HISAT2   1294
## 6: ENSMUSG00000028218        STAR   1030  834.59246      HISAT2    956
##    baseMean.y logMeanExpression RankDiff MeanRank num.trans
## 1:  209.61909          6.146656     -169   1086.5         1
## 2:  105.93203          4.909043      144   1281.0         1
## 3:  118.83935          5.382767      106   1314.0         1
## 4:   55.27287          4.592998       93   1349.5         2
## 5:  556.37094          6.803897       82   1335.0        25
## 6:  777.50650          7.109345       74    993.0         8
head(large.rankdiff[[rankdiff.levels[4]]])
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000062518        STAR    461  362.34316      HISAT2    243
## 2: ENSMUSG00000084817        STAR    784 6016.81813      HISAT2    676
## 3: ENSMUSG00000078498        STAR    393  272.90445      HISAT2    287
## 4: ENSMUSG00000115852        STAR   1032  328.77715      HISAT2    932
## 5: ENSMUSG00000093898        STAR    110   82.54365      HISAT2    205
## 6: ENSMUSG00000060467        STAR    786 6440.19834      HISAT2    697
##    baseMean.y logMeanExpression RankDiff MeanRank num.trans
## 1:   209.6191          6.146656      218    352.0         1
## 2:  4790.6378          9.037431      108    730.0         1
## 3:   144.3794          5.843817      106    340.0         2
## 4:   266.1622          6.135258      100    982.0         4
## 5:   105.9320          4.909043      -95    157.5         1
## 6:  4669.4365          9.079653       89    741.5         1
# Set a function saving rankdiff.csv files
saving.fn <- function(input.df, data.type) { 

    filename <- paste0(DB, "_DE_", data.type, "_RankDiff.csv")

    return(write.csv(input.df, filename))
}



# Save the filtered and arranged data frames as csv files
saving.fn(large.rankdiff[[rankdiff.levels[1]]], "FDR")
saving.fn(large.rankdiff[[rankdiff.levels[2]]], "LFC")
saving.fn(large.rankdiff[[rankdiff.levels[3]]], "LFC_Increase")
saving.fn(large.rankdiff[[rankdiff.levels[4]]], "LFC_Decrease")

Summarizing up/down DEGs with an upset plot

red bar: aligner

blue bar: directionality of gene expression change

# Generate a data frame storing upset input variables
upset.dataframe <- subset(lfc.dataframe, !is.na(padj)) %>%
    
    mutate(Up=ifelse(FDR == paste("<", alpha) & log2FoldChange > 0, GENEID, ""), 

           Down=ifelse(FDR == paste("<", alpha) & log2FoldChange < 0, GENEID, ""),

           Unchanged=ifelse(FDR == paste(">", alpha), GENEID, ""),

           Salmon=ifelse(Alignment == "Salmon", GENEID, ""), 

           STAR=ifelse(Alignment == "STAR", GENEID, ""),

           HISAT2=ifelse(Alignment == "HISAT2", GENEID, ""))

# Generate a list input 
upset.input <- list(Up=upset.dataframe$Up, 
                    Down=upset.dataframe$Down,
                    Unchanged=upset.dataframe$Unchanged, 
                    Salmon=upset.dataframe$Salmon,
                    STAR=upset.dataframe$STAR,
                    HISAT2=upset.dataframe$HISAT2)

# Create an upset plot
upset(fromList(upset.input), 
      sets=names(upset.input),   # What group to display 
      sets.x.label="Number of Genes per Group",
      order.by="freq",
      point.size=3,
      sets.bar.color=c("red", "red", "blue", "red", "blue", "blue"),
      text.scale = 1.5, number.angles=30)

Session Info

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /home/mira/miniconda3/envs/r/lib/libblas.so.3.8.0
## LAPACK: /home/mira/miniconda3/envs/r/lib/liblapack.so.3.8.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ensembldb_2.14.0            AnnotationFilter_1.14.0    
##  [3] GenomicFeatures_1.42.1      AnnotationDbi_1.52.0       
##  [5] UpSetR_1.4.0                DESeq2_1.30.1              
##  [7] SummarizedExperiment_1.20.0 Biobase_2.50.0             
##  [9] MatrixGenerics_1.2.0        matrixStats_0.58.0         
## [11] GenomicRanges_1.42.0        GenomeInfoDb_1.26.0        
## [13] IRanges_2.24.1              S4Vectors_0.28.1           
## [15] Rsubread_2.4.0              tximport_1.18.0            
## [17] AnnotationHub_2.22.0        BiocFileCache_1.14.0       
## [19] dbplyr_2.1.0                BiocGenerics_0.36.0        
## [21] pheatmap_1.0.12             rmarkdown_2.7              
## [23] forcats_0.5.1               stringr_1.4.0              
## [25] dplyr_1.0.5                 purrr_0.3.4                
## [27] readr_1.4.0                 tidyr_1.1.3                
## [29] tibble_3.1.0                ggplot2_3.3.3              
## [31] tidyverse_1.3.0             data.table_1.14.0          
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_2.0-0              ellipsis_0.3.1               
##   [3] XVector_0.30.0                fs_1.5.0                     
##   [5] rstudioapi_0.13               farver_2.0.3                 
##   [7] bit64_4.0.5                   interactiveDisplayBase_1.28.0
##   [9] fansi_0.4.2                   lubridate_1.7.10             
##  [11] xml2_1.3.2                    splines_4.0.3                
##  [13] cachem_1.0.4                  geneplotter_1.68.0           
##  [15] knitr_1.31                    jsonlite_1.7.2               
##  [17] Rsamtools_2.6.0               broom_0.7.5                  
##  [19] annotate_1.68.0               shiny_1.6.0                  
##  [21] BiocManager_1.30.10           compiler_4.0.3               
##  [23] httr_1.4.2                    backports_1.2.1              
##  [25] lazyeval_0.2.2                assertthat_0.2.1             
##  [27] Matrix_1.3-2                  fastmap_1.1.0                
##  [29] cli_2.3.1                     later_1.1.0.1                
##  [31] prettyunits_1.1.1             htmltools_0.5.1.1            
##  [33] tools_4.0.3                   gtable_0.3.0                 
##  [35] glue_1.4.2                    GenomeInfoDbData_1.2.4       
##  [37] rappdirs_0.3.3                Rcpp_1.0.6                   
##  [39] cellranger_1.1.0              jquerylib_0.1.3              
##  [41] Biostrings_2.58.0             vctrs_0.3.6                  
##  [43] rtracklayer_1.50.0            xfun_0.21                    
##  [45] ps_1.5.0                      rvest_0.3.6                  
##  [47] mime_0.10                     lifecycle_1.0.0              
##  [49] XML_3.99-0.5                  zlibbioc_1.36.0              
##  [51] scales_1.1.1                  ProtGenerics_1.22.0          
##  [53] hms_1.0.0                     promises_1.2.0.1             
##  [55] RColorBrewer_1.1-2            yaml_2.2.1                   
##  [57] curl_4.3                      gridExtra_2.3                
##  [59] memoise_2.0.0                 sass_0.3.1                   
##  [61] biomaRt_2.46.3                stringi_1.5.3                
##  [63] RSQLite_2.2.3                 highr_0.8                    
##  [65] BiocVersion_3.12.0            genefilter_1.72.1            
##  [67] BiocParallel_1.24.0           rlang_0.4.10                 
##  [69] pkgconfig_2.0.3               bitops_1.0-6                 
##  [71] evaluate_0.14                 lattice_0.20-41              
##  [73] labeling_0.4.2                GenomicAlignments_1.26.0     
##  [75] bit_4.0.4                     tidyselect_1.1.0             
##  [77] plyr_1.8.6                    magrittr_2.0.1               
##  [79] R6_2.5.0                      generics_0.1.0               
##  [81] DelayedArray_0.16.0           DBI_1.1.1                    
##  [83] pillar_1.5.0                  haven_2.3.1                  
##  [85] withr_2.4.1                   survival_3.2-7               
##  [87] RCurl_1.98-1.2                modelr_0.1.8                 
##  [89] crayon_1.4.1                  utf8_1.1.4                   
##  [91] progress_1.2.2                locfit_1.5-9.4               
##  [93] grid_4.0.3                    readxl_1.3.1                 
##  [95] blob_1.2.1                    reprex_1.0.0                 
##  [97] digest_0.6.27                 xtable_1.8-4                 
##  [99] httpuv_1.5.5                  openssl_1.4.3                
## [101] munsell_0.5.0                 bslib_0.2.4                  
## [103] askpass_1.1